about¶

evalutions of pseudobulk rna-seq samples

In [1]:
# data quality
mets_only_tcga = True
gene_sparsity_ceiling_tcga = 0.5
# pseudobulk generation parameters
n_cells_per_cell_type = 10
malignant_from_one_sample = True
In [2]:
import pathlib

figure_path = pathlib.Path("figures-8e")
figure_path.mkdir(parents=True, exist_ok=True)
figure_path
Out[2]:
PosixPath('figures-8e')

imports¶

In [3]:
import logging

import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io
In [4]:
# plotly.io.renderers.default = "plotly_mimetype+notebook_connected"
In [5]:
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)
In [6]:
logging.getLogger("gcsfs").setLevel("INFO")
logging.getLogger("google.cloud.bigquery").setLevel("DEBUG")
logging.getLogger("helpers").setLevel("DEBUG")
logging.getLogger("helpers.creating_mixtures").setLevel("INFO")
logging.getLogger("pandas").setLevel("DEBUG")
logging.getLogger("pyarrow").setLevel("DEBUG")
In [7]:
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
In [8]:
rng = np.random.default_rng(seed=0)

loading data¶

In [9]:
%%time
bulk_tcga_skcm = helpers.datasets.load_tcga_skcm_hg19_scaled_estimate_firebrowse()
bulk_tcga_skcm *= 1_000_000 / bulk_tcga_skcm.sum()
2022-05-20 20:34:35,057 - helpers.datasets - DEBUG - reading gs://liulab/firebrowse.org/SKCM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt
CPU times: user 13.4 s, sys: 1.4 s, total: 14.8 s
Wall time: 22.9 s
In [10]:
%%time
fractions_tcga_skcm = helpers.datasets.load_tcga_skcm_fractions_from_csx()
2022-05-20 20:34:57,984 - helpers.datasets - DEBUG - loading TCGA SKCM fractions estimated by CIBERSORTx
CPU times: user 12 ms, sys: 3.96 ms, total: 15.9 ms
Wall time: 90.7 ms
In [11]:
%%time
sc_jerby_arnon, sc_metadata_jerby_arnon = helpers.datasets.load_jerby_arnon(
    ref_genome="hg19", units="tpm"
)
sc_jerby_arnon *= 1_000_000 / sc_jerby_arnon.sum()
2022-05-20 20:34:58,082 - helpers.datasets - DEBUG - loading Jerby-Arnon scRNA-seq data
2022-05-20 20:35:31,823 - helpers.datasets - DEBUG - loading Jerby-Arnon metadata
CPU times: user 1min 10s, sys: 16.9 s, total: 1min 27s
Wall time: 34.7 s

apply filters to data¶

In [12]:
# determine gene exclusions

exclusions_genes = pd.DataFrame(index=bulk_tcga_skcm.index.union(sc_jerby_arnon.index))

## sparsity in tcga
genes_sparse_tcga_skcm = bulk_tcga_skcm[
    (bulk_tcga_skcm == 0).mean(axis=1) > gene_sparsity_ceiling_tcga
].index
exclusions_genes["sparse_in_tcga_skcm"] = exclusions_genes.index.map(
    lambda g: g in genes_sparse_tcga_skcm
)

## genes not in both cohorts
genes_in_both = bulk_tcga_skcm.index.intersection(sc_jerby_arnon.index)
exclusions_genes["not_in_both_cohorts"] = exclusions_genes.index.map(
    lambda gene_name: gene_name not in genes_in_both
)

good_genes = exclusions_genes.loc[~exclusions_genes.any(axis=1)].index

https://console.cloud.google.com/bigquery?authuser=1&project=keen-dispatch-316219

In [13]:
# determine tcga sample exclusions

exclusions_tcga_samples = pd.DataFrame(index=bulk_tcga_skcm.columns)

## limit to metastases

if mets_only_tcga:
    query_text = """
        SELECT aliquot_barcode
        FROM `keen-dispatch-316219.gdc_tcga_skcm_subset.aliquot2caseIDmap_current`
        WHERE sample_type_name = 'Metastatic'
    """
    metastatic_aliquot_barcodes = pd.read_gbq(query_text)["aliquot_barcode"].values
    exclusions_tcga_samples["is_not_metastatic"] = exclusions_tcga_samples.index.map(
        lambda sample: sample not in metastatic_aliquot_barcodes
    )

good_tcga_samples = exclusions_tcga_samples.loc[
    ~exclusions_tcga_samples.any(axis=1)
].index
In [14]:
# apply exclusions
sc_jerby_arnon_cleaned = sc_jerby_arnon.loc[good_genes]
sc_jerby_arnon_cleaned *= 1_000_000 / sc_jerby_arnon_cleaned.sum()
bulk_tcga_skcm_cleaned = bulk_tcga_skcm.loc[good_genes][good_tcga_samples]
bulk_tcga_skcm_cleaned *= 1_000_000 / bulk_tcga_skcm_cleaned.sum()
fractions_tcga_skcm_cleaned = fractions_tcga_skcm.loc[good_tcga_samples]

print(
    "without exclusions:",
    [df.shape for df in (sc_jerby_arnon, bulk_tcga_skcm, fractions_tcga_skcm)],
)

print(
    "with exclusions:",
    [
        df.shape
        for df in (
            sc_jerby_arnon_cleaned,
            bulk_tcga_skcm_cleaned,
            fractions_tcga_skcm_cleaned,
        )
    ],
)
without exclusions: [(23686, 7186), (20501, 473), (473, 9)]
with exclusions: [(16063, 7186), (16063, 368), (368, 9)]

compute pseudobulks¶

In [15]:
bulk_pseudo, bulk_pseudo_cell_type_geps = helpers.creating_mixtures.make_mixtures(
    sc_jerby_arnon_cleaned,
    sc_metadata_jerby_arnon,
    fractions_tcga_skcm_cleaned.rename(index=lambda sample: f"pseudo_like_{sample}"),
    n_cells_per_gep=n_cells_per_cell_type,
    normalization_factor=1_000_000,
    malignant_from_one_sample=malignant_from_one_sample,
    rng=rng,
)

funcs¶

In [16]:
def calc_gene_sparsity(df):
    return (df == 0).mean(axis=1)


def calc_gene_density(df):
    return (df != 0).mean(axis=1)

figures¶

bulk tcga¶

sparsity¶

In [17]:
x = calc_gene_sparsity(bulk_tcga_skcm)

fig = go.Figure(
    data=[
        go.Histogram(
            x=x,
            xbins=dict(start=0.0, end=1.0, size=0.1),
        )
    ]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
    title="Gene sparsity of TCGA SKCM bulk RNA-seq (hg19 TPM) <br>(original data)",
    xaxis_title="sparsity (binned)",
    yaxis_title="count of genes",
)
fig.write_image(
    figure_path / "bulk_tcga_skcm_sparsity.png", scale=2.0, width=500, height=500
)
fig
In [18]:
x = calc_gene_sparsity(bulk_tcga_skcm_cleaned)

fig = go.Figure(
    data=[
        go.Histogram(
            x=x,
            xbins=dict(start=0.0, end=1.0, size=0.1),
        )
    ]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
    title="Gene sparsity of TCGA SKCM bulk RNA-seq (hg19 TPM) <br>(metastatic only, excl >50% sparse genes)",
    xaxis_title="sparsity (binned)",
    yaxis_title="count of genes",
)
fig.write_image(
    figure_path / "bulk_tcga_skcm_cleaned_sparsity.png",
    scale=2.0,
    width=500,
    height=500,
)
fig

means¶

In [19]:
fig = go.Figure(
    data=[
        go.Histogram(
            x=x,
            xbins=dict(start=-2, end=5, size=0.5),
        )
    ]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[-2, 5])
# fig.update_yaxes(range=[0, 10_000])
fig.update_layout(
    title="Distribution of mean gene expression: TCGA SKCM bulk RNA-seq <br>(hg19 TPM, metastatic only, excl >50% sparse genes)",
    xaxis_title="Gene expression (binned, log10-transformed)",
    yaxis_title="count of genes",
)


fig.write_image(
    figure_path / "bulk_tcga_skcm_cleaned_dist_mean.png",
    scale=1.5,
    width=800,
    height=800,
)
fig

https://plotly.com/python/distplot/

In [20]:
import plotly.figure_factory as ff

x = np.log(bulk_tcga_skcm_cleaned.mean(axis="columns")) / np.log(10)
hist_data = [x]
group_labels = ["TCGA SKCM bulk RNA-seq"]  # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, bin_size=0.2)
fig.update_layout(bargap=0.1, title="Distribution of mean gene expression")
fig.update_xaxes(showgrid=True, title="Mean gene expression (log10 scale)")
fig.update_yaxes(showgrid=True)
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))

fig.write_image(
    figure_path / "bulk_tcga_skcm_cleaned_dist_mean__create_distplot.png",
    scale=1.5,
    width=800,
    height=800,
)
fig

stddev¶

sc jerby-arnon¶

sparsity¶

In [21]:
x = calc_gene_sparsity(sc_jerby_arnon)

fig = go.Figure(
    data=[
        go.Histogram(
            x=x,
            xbins=dict(start=0.0, end=1.0, size=0.1),
        )
    ]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
    title="Gene sparsity of Jerby-Arnon scRNA-seq (hg19 TPM)",
    xaxis_title="sparsity (binned)",
    yaxis_title="count of genes",
)
fig.write_image(
    figure_path / "sc_jerby_arnon_sparsity.png", scale=2.0, width=500, height=500
)
fig

appendix - altair plots¶

In [22]:
import altair as alt

alt.data_transformers.disable_max_rows()
Out[22]:
DataTransformerRegistry.enable('default')
In [23]:
x = calc_gene_sparsity(bulk_tcga_skcm).to_frame(name="sparsity").reset_index()

alt.Chart(x).mark_bar().encode(
    alt.X(
        "sparsity:Q",
        bin=alt.Bin(extent=[0, 1]),
    ),
    y="count()",
)
Out[23]:
In [24]:
x = (
    calc_gene_sparsity(bulk_tcga_skcm.loc[good_genes][good_tcga_samples])
    .to_frame(name="sparsity")
    .reset_index()
)

alt.Chart(x).mark_bar().encode(
    alt.X(
        "sparsity:Q",
        bin=alt.Bin(extent=[0, 1]),
    ),
    y="count()",
)
Out[24]:
In [25]:
x = calc_gene_sparsity(sc_jerby_arnon).to_frame(name="sparsity").reset_index()

chart = (
    alt.Chart(x)
    .mark_bar()
    .encode(
        alt.X(
            "sparsity:Q",
            bin=alt.Bin(extent=[0, 1]),
        ),
        y="count()",
    )
)

chart.save(figure_path / "sc_jerby_arnon_sparsity.png", scale_factor=5.0)

chart
Out[25]:
In [ ]:
 
In [ ]: